if (rstudioapi::isAvailable()) {
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(conflicted)
conflict_prefer("filter", "dplyr")## [conflicted] Will prefer dplyr::filter over any other package
conflict_prefer("select", "dplyr")## [conflicted] Will prefer dplyr::select over any other package
conflict_prefer("slice", "dplyr")## [conflicted] Will prefer dplyr::slice over any other package
conflict_prefer("rename", "dplyr")## [conflicted] Will prefer dplyr::rename over any other package
conflict_prefer('intersect', 'dplyr')## [conflicted] Will prefer dplyr::intersect over any other package
library(ggrepel)mytheme = theme_bw(base_size = 10) +
theme(text = element_text(size=10, colour='black'),
title = element_text(size=10, colour='black'),
line = element_line(size=0.5),
axis.title = element_text(size=10, colour='black'),
axis.text = element_text(size=10, colour='black'),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(size=0.5),
strip.background = element_blank(),
strip.text.y = element_blank(),
panel.grid = element_blank(),
panel.border = element_blank(),
legend.position = c(0.8, 0.8),
legend.text=element_text(size=10))
mytheme_discrete_x = mytheme +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
# a colorblind-friendly palette
# colorblind.palette.grey = c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")all_circ = read_tsv('../data/Supplementary_Table_2_all_circRNAs.txt')## Rows: 1137099 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (15): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group...
## dbl (6): start, end, BSJ_count, n_detected, n_db, estim_len_in
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cq = read_tsv('../data/Supplementary_Table_3_selected_circRNAs.txt')## Rows: 1560 Columns: 44
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (24): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (20): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
val = read_tsv('../data/Supplementary_Table_4_precision_values.txt')## Rows: 29 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group
## dbl (14): nr_qPCR_total, nr_qPCR_fail, nr_qPCR_val, nr_RR_total, nr_RR_fail,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
val$tool = factor(val$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl"))
all_circcqvalfirst how many per group in selected circ?
nr_detected = all_circ %>%
group_by(chr, start, end, circ_id, cell_line) %>%
summarise(n_detected = n()) %>%
ungroup()## `summarise()` has grouped output by 'chr', 'start', 'end', 'circ_id'. You can
## override using the `.groups` argument.
val_df = cq %>% left_join(nr_detected) %>%
select(circ_id, qPCR_val, RR_val, amp_seq_val, n_detected, cell_line) %>% unique() %>%
pivot_longer(cols = c(qPCR_val, RR_val, amp_seq_val), names_to = "val_type", values_to = "val")## Joining, by = c("chr", "start", "end", "circ_id", "cell_line", "n_detected")
val_dfval_df$val_type = factor(val_df$val_type, levels = c('qPCR_val', 'RR_val', 'amp_seq_val'))
val_df %>%
ggplot(aes(n_detected, fill = val)) +
geom_bar() +
facet_grid(~val_type) +
mytheme +
scale_fill_manual(values = c('#CC79A7', '#00A875' ,'#999999')) +
ylab('number of circRNAs') +
xlab('number of tools the circRNAs was detected by')## Warning: Removed 54 rows containing non-finite values (stat_count).
#ggsave('separate_figures/figure_4A.pdf', width = 21, height = 8.5, units = "cm")simple_union = read_tsv('../data/Supplementary_Table_5_combo_2tools.txt')## Rows: 675 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool_1, tool_2, cell_line, count_group, tool_lt_1, tool_lt_2, tool_...
## dbl (7): nr_union, nr_intersection, perc_compound_val_1, perc_compound_val_2...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
simple_unionadd total nr of circ for that cell line
total_cl = all_circ %>%
group_by(cell_line) %>%
filter(count_group == 'count ≥ 5') %>%
select(circ_id, cell_line) %>%
unique() %>%
count(cell_line) %>%
rename(total_cell_line = n) %>% ungroup()
total_clunion_sub = simple_union %>%
# filter based on percentage increase
left_join(total_cl) %>%
filter((nr_union - pmax(total_n_1, total_n_2))/total_cell_line > 0.075) %>%
#filter(nr_union - pmax(total_n_1, total_n_2) > 999) %>%
filter(perc_compound_val_1 >= 0.9,
perc_compound_val_2 >= 0.9)## Joining, by = "cell_line"
add individual tools of interest
union_sub = union_sub %>%
bind_rows(simple_union %>%
filter(tool_1 == tool_2,
tool_1 %in% (union_sub %>% pull(tool_1, tool_2) %>% unique())) %>%
left_join(total_cl) )## Joining, by = "cell_line"
remove doubles
union_sub = union_sub %>%
group_by(tool_combo, cell_line) %>%
sample_n(1) %>%
ungroup()
union_subtry as percentage of all circ in that cell line
union_sub %>%
left_join(all_circ %>% select(circ_id, cell_line, count_group) %>%
filter(count_group == "count ≥ 5") %>%
unique() %>% count(cell_line) %>% rename(total_n = n)) %>%
mutate(perc_union = nr_union/total_n) %>%
#filter(cell_line == "HLF") %>%
ggplot(aes(w_val_rate, perc_union)) +
geom_point(aes(color = (tool_1 == tool_2))) +
geom_text_repel(aes(label=str_remove(tool_combo, '-'), color = (tool_1 == tool_2)), max.overlaps = 20, size = 3) +
mytheme +
theme(legend.position = 'NA') +
facet_wrap(~cell_line) +
scale_color_manual(values = c('#E69F00', '#0072B2')) +
scale_x_continuous(labels = scales::percent_format()) +
scale_y_continuous(labels = scales::percent_format()) +
xlab('validation rate') +
ylab('number of circRNAs')## Joining, by = "cell_line"
#ggsave('figure_4B.pdf', width = 12, height = 10, units = "cm")
#ggsave('sup_figure_24.pdf', width = 20, height = 20, units = "cm")check mean increase in perc
simple_union %>%
filter(count_group == "count ≥ 5",
!tool_1 == tool_2,
perc_compound_val_1 >= 0.9,
perc_compound_val_2 >= 0.9) %>%
mutate(increase = nr_union - total_n_1,
increase_prec = increase/total_n_1) %>% #view()
pull(increase_prec) %>%
quantile()## 0% 25% 50% 75% 100%
## 0.0000000 0.1844617 0.3599387 1.0824845 3.2251773